
library(hydroGOF)

data=read.csv("C:/in/myd_merge.csv")
data=subset(data,data$station!=3)

data1=data

model1=lm(pm25~aod+temp+avg_temp+avg_rh+avg_preci_24,data1)
#model1=lm(pm25~I(aod^2) + I(temp^2) + I(avg_temp^2) + I(avg_rh^2) + I(avg_preci_24^2) + aod*temp + aod*avg_temp + aod*avg_rh + aod*avg_preci_24 + temp*avg_temp + temp*avg_rh + temp*avg_preci_24 + avg_temp*avg_rh + avg_temp*avg_preci_24 + avg_rh*avg_preci_24 + aod + temp + avg_temp + avg_rh + avg_preci_24,data1)


pm25model=model1$fitted.values
model_re=sum(abs(data1$pm25-pm25model)/data1$pm25)/nrow(data1)*100
model_rmse=rmse(data1$pm25,pm25model)
model_r=cor(data1$pm25,pm25model)

print("Org Model:")
print(paste("Number of sample: ",nrow(data1)))
print(paste("R2:",round(model_r*model_r,3)))
print(paste("RMSE:",round(model_rmse,3)))
print(paste("RE:",round(model_re,3)))

nSample=nrow(data1)
nPredict= length(model1$coefficients)-1

cutoff=4/(nSample-nPredict-1)
cook_thresold=qf(0.2,nPredict+1,nSample-nPredict-1)
cookValue=cooks.distance(model1)
data1["cookValue"] = cookValue

data2=subset(data1,data1$cookValue<=cutoff)
data3=subset(data1,data1$cookValue<=cook_thresold)

model2=lm(pm25~aod+temp+avg_temp+avg_rh+avg_preci_24,data2)
#model2=lm(pm25~I(aod^2) + I(temp^2) + I(avg_temp^2) + I(avg_rh^2) + I(avg_preci_24^2) + aod*temp + aod*avg_temp + aod*avg_rh + aod*avg_preci_24 + temp*avg_temp + temp*avg_rh + temp*avg_preci_24 + avg_temp*avg_rh + avg_temp*avg_preci_24 + avg_rh*avg_preci_24 + aod + temp + avg_temp + avg_rh + avg_preci_24,data2)

pm25model=model2$fitted.values
model_re=sum(abs(data2$pm25-pm25model)/data2$pm25)/nrow(data2)*100
model_rmse=rmse(data2$pm25,pm25model)
model_r=cor(data2$pm25,pm25model)

print("Outlier 1:")
print(paste("Number of sample: ",nrow(data2)))
print(paste("R2:",round(model_r*model_r,3)))
print(paste("RMSE:",round(model_rmse,3)))
print(paste("RE:",round(model_re,3)))

data2["predict"] = model2$fitted.values

#write.csv(data2,"C:/out/mod_quad_4.csv")

 model3=lm(pm25~aod+temp+avg_temp+avg_rh+avg_preci_24,data3)
 #model3=lm(data3$pm25~data3$aod+data3$temp+data3$avg_temp+data3$avg_rh+data3$avg_preci_24)
 pm25model=model3$fitted.values
 model_re=sum(abs(data3$pm25-pm25model)/data3$pm25)/nrow(data3)*100
 model_rmse=rmse(data3$pm25,pm25model)
 model_r=cor(data3$pm25,pm25model)
 
 print("Cook D:")
 print(paste("Number of sample: ",nrow(data3)))
 print(paste("R2:",round(model_r*model_r,3)))
 print(paste("RMSE:",round(model_rmse,3)))
 print(paste("RE:",round(model_re,3)))

 write.csv(model3$coefficients,"C:/a/myd_coff.csv")
 